import os
import pandas as pd
import json
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from collections import defaultdict
import numpy as np
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import seaborn as sns
from langdetect import detect
from langdetect import DetectorFactory
import pickle
import spacy
from tqdm import tqdm
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
data = pd.read_csv('data.csv')
data.shape
data.columns
data.head(5)
data.describe()
data.info()
# Simple numeric features derived from the body text that might be useful later
data['digit_count'] = data['body_text'].apply(lambda x: len([w for w in x.split() if w.isdigit()]))  # count of numeric tokens
data['body_word_count'] = data['body_text'].apply(lambda x: len(x.strip().split()))  # word count in body
data['body_unique_words'] = data['body_text'].apply(lambda x: len(set(x.strip().split())))  # unique word count in body
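# A quick look at the new feature columns (a minimal sanity check)
data[['digit_count', 'body_word_count', 'body_unique_words']].describe()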
data.drop_duplicates(['abstract', 'body_text'], inplace=True)
data['abstract'].describe(include='all')
data.dropna(inplace=True)
#Checking languages
DetectorFactory.seed = 0
languages = []
for i in range(0, len(data)):
    # split by space into a list and detect on the first 50 words
    text = data.iloc[i]['body_text'].split(" ")
    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text))
    except Exception:
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        except Exception:
            try:
                # let's try to label it through the abstract then
                lang = detect(data.iloc[i]['abstract'])
            except Exception:
                lang = "unknown"
    languages.append(lang)
languages_dict = {}
for lang in set(languages):
    languages_dict[lang] = languages.count(lang)
data['language'] = languages
for key, value in languages_dict.items():
    print(str(key) + ' : ' + str(value))
fig, ax = plt.subplots(figsize=(10, 7))
graph = plt.bar(range(len(languages_dict)), list(languages_dict.values()))
plt.xticks(range(len(languages_dict)), list(languages_dict.keys()))
ax.set_title("Language Distribution in Dataset")
ax.set_ylabel("Number of Articles")

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(graph)
plt.savefig("language_distribution.png")
plt.show()
#let's drop all non-English articles
data = data[data['language'] == 'en']
data.info()
print("Average unique words per article:", data['body_unique_words'].mean())
print("Average word count per article:", data['body_word_count'].mean())
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['body_unique_words'], bins=150)
max_x = 12000
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1500))
plt.title("Unique Word Count Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel('Number of Unique Words')
plt.savefig("unique_words.png")
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['body_word_count'], bins=300)
max_x = 30000
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,2500))
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.title("Word Count Distribution")
plt.savefig("word_count.png")
factors = {
    'alcohol': ['alcohol', 'drinking', 'toxic', 'drink'],
    'heart': ['heart disease', 'heart failure', 'blood pressure', 'hypertension'],
    'gender': ['gender', 'sex', 'male', 'female'],
    'weight': ['obese', 'overweight', 'weight'],
    'lung': ['smoke', 'smoking', 'cigarettes', 'tobacco', 'lung', 'respiratory', 'asthma'],
    'age': ['age', 'elderly', 'senior', 'adult', 'dementia']
}
f = ['alcohol', 'heart', 'gender', 'weight', 'lung', 'age']

def tag(df, d):
    # Flag articles whose body mentions any keyword for each factor. The
    # keywords are joined into a single regex so that every topic counts;
    # assigning inside an inner loop would keep only the last keyword.
    for factor, topics in d.items():
        pattern = '|'.join(topics)
        df[factor] = np.where(df['body_text_new'].str.contains(pattern, case=False, na=False), 1, 0)
    return df
data = tag(data,factors)
data = data.fillna(0)
data.head(5)
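# Quick sanity check: how many articles were flagged for each factor
# (assumes the 0/1 tag columns created above)
print(data[f].sum())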
risk = pd.melt(data,
               value_vars=f,
               var_name='tag',
               value_name='count')
risk = risk[risk['count'] == 1]
risk.head()
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.countplot(data=risk,x='tag')
# ax.set_ylim(43670,43675)
plt.title("Risk Tags")
ax.set_ylabel("Frequency")
ax.set_xlabel('Tags')
plt.savefig("tags.png")
# Partial stems are deliberate: 'estimat' matches 'estimate', 'estimation', etc.
risk_words = ['risk', 'estimat', 'characteristic', 'factors', 'features', 'study', 'predict', 'clinic']

def risk_tag(df, l):
    # Join the keywords into one regex so that every word counts,
    # not just the last one in the list
    pattern = '|'.join(l)
    df['risk'] = np.where(df['title'].str.contains(pattern, case=False, na=False), 1, 0)
    return df

data = risk_tag(data, risk_words)
data = data.fillna(0)
data.head(5)
print('There are {} articles that are tagged as risk articles'.format(len(data[data['risk'] == 1])))
# Thought this would be useful but it wasn't
quantile_95 = data['body_word_count'].quantile(0.95)
df_95 = data[data['body_word_count'] < quantile_95]
plt.figure(figsize=(12.8,6))
sns.distplot(df_95['body_word_count']).set_title('Text Body Word Count');
data.to_csv('features.csv',index=False)
data = pd.read_csv('features.csv')
data.head(5)
def vectorize(text, maxfeatures):
    vectorizer = TfidfVectorizer(max_df=0.1,
                                 min_df=0.05,
                                 ngram_range=(1, 2),
                                 max_features=maxfeatures)
    X = vectorizer.fit_transform(text)
    return X
text = data['body_text_new'].values
X = vectorize(text, 2 ** 10)
X.shape
#Reduce dimensionality before clustering: keeping 90% of the variance with PCA
#should make k-means behave better in a much lower-dimensional space
pca = PCA(n_components=0.9)
X_reduced = pca.fit_transform(X.toarray())
X_reduced.shape
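# How aggressive was the reduction? A minimal check of the fitted PCA
# (n_components_ and explained_variance_ratio_ are standard sklearn attributes)
print("Components kept:", pca.n_components_)
print("Variance explained:", pca.explained_variance_ratio_.sum())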
#Before running k-means we need to pick k. The elbow method fits k-means for a
#range of k and looks for the point where the distortion (sum of squared
#distances from points to their cluster center) stops dropping quickly.
k_means = KMeans()
visualizer = KElbowVisualizer(k_means, k=(4,30))
visualizer.fit(X_reduced) # Fit the data to the visualizer
visualizer.show(outpath="elbow.png")  # outpath saves the elbow plot to disk
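# KElbowVisualizer records the k it picked; a minimal check of that value
# (elbow_value_ is set after fitting)
print("Suggested k:", visualizer.elbow_value_)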
#So it looks like a good k is 17!
#Let's run kmeans with 17
k = 17
kmeans = KMeans(n_clusters=k,random_state=0)
y_predictions = kmeans.fit_predict(X_reduced)
data['y_predictions'] = y_predictions
print('Done')
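# Optional quality check on the clustering (a sketch; silhouette_score can be
# slow on large inputs, hence the sample_size)
from sklearn.metrics import silhouette_score
print("Silhouette:", silhouette_score(X_reduced, y_predictions, sample_size=1000, random_state=0))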
tsne = TSNE(verbose=1, perplexity=50)
X_embedded = tsne.fit_transform(X.toarray())
sns.set(rc={'figure.figsize':(15,15)})
palette = sns.hls_palette(k, l=.5, s=.9)
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_predictions, legend='full', palette=palette)
plt.title('T-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()
vectorizers = []
for _ in range(k):
    # One vectorizer per cluster, keeping alphabetic/hyphenated tokens of 3+ characters
    vectorizers.append(CountVectorizer(min_df=5, max_df=0.9,
                                       stop_words='english',
                                       lowercase=True,
                                       token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}'))
vectorized_data = []
for cluster, vectorizer in enumerate(vectorizers):
    vectorized_data.append(vectorizer.fit_transform(data.loc[data['y_predictions'] == cluster, 'body_text_new']))
num_topics = 5
lda_models = []
for num in range(k):
    lda = LatentDirichletAllocation(n_components=num_topics,
                                    learning_method='online',
                                    verbose=False)
    lda_models.append(lda)
clusters_lda_data = []
for cluster, lda in enumerate(lda_models):
    clusters_lda_data.append(lda.fit(vectorized_data[cluster]))
def print_topics(model, count_vectorizer, n_top_words=10):
    top_topics = []
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        topic = " ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        top_topics.append(topic)
        print(topic)
    return top_topics
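# print_topics is defined but never called below; a minimal usage sketch for a
# single cluster (cluster 0 is an arbitrary choice)
_ = print_topics(lda_models[0], vectorizers[0])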
def selected_topics(model, vectorizer, top_n=4):
    # Collect the top_n highest-weighted words from each LDA topic,
    # de-duplicated and sorted by weight (descending)
    current_words = []
    keywords = []
    for idx, topic in enumerate(model.components_):
        words = [(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        for word in words:
            if word[0] not in current_words:
                keywords.append(word)
                current_words.append(word[0])
    keywords.sort(key=lambda x: x[1], reverse=True)
    return [word for word, weight in keywords]
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
#Let's write these words to a file
with open('topics.txt', 'w') as fout:
    for topic_num, words in enumerate(all_keywords):
        fout.write('Topic ' + str(topic_num) + ':\n' + ', '.join(words) + '\n\n')
        print('Topic ' + str(topic_num) + ':\n' + ', '.join(words) + '\n')
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['y_predictions'], bins=150)
max_x = k
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1))
plt.title("Cluster Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel('Cluster Number')
plt.savefig("cluster_dist.png")
#Let's save all the files that took forever to run
data.to_csv('data1.csv',index=False)
pickle.dump(y_predictions, open("y_predictions.p", "wb" ))
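# The fitted models can be pickled too (a sketch; the file names are my choice)
pickle.dump(kmeans, open("kmeans_model.p", "wb"))
pickle.dump(X_embedded, open("tsne_embedding.p", "wb"))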
data = pd.read_csv('data1.csv')
data.head()
#Let's manually look at the topics and assign each cluster an overarching label
y_topics = ['coronavirus genome',
            'clinical treatment',
            'transmission simulation',
            'virus rna',
            'cell binding',
            'virus sampling',
            'animals and viruses',
            'vaccine compounds',
            'infections',
            'virus detection',
            'risk factors',
            'similar diseases',
            'animal transmission',
            'children and asthma',
            'death rates',
            'lung issues',
            'testing']
#For risk factors there are a few relevant clusters, with the greatest focus in
#cluster 13, so the articles that talk about risk are y_predictions == 13.
#Let's first look at all the articles with risk-associated words in the title
#to check which clusters we should analyze further.
def contains(df, word):
    # na=False guards against missing titles; case=False matches risk_tag above
    return df[df['title'].str.contains(word, case=False, na=False)]
risk_data = pd.DataFrame()
for word in risk_words:
    out = contains(data, word)
    risk_data = pd.concat([risk_data, out], join='outer')
len(risk_data)
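# Articles whose titles match several risk words appear more than once in
# risk_data; a quick check of how much double counting that causes (a sketch)
print(risk_data.duplicated(subset=['title']).sum(), "rows are repeat matches")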
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(risk_data['y_predictions'], bins=150)
max_x = k
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1))
plt.title("Clusters Containing Risk Information")
ax.set_ylabel("Frequency")
ax.set_xlabel('Cluster Number')
plt.savefig("risk_cluster_dist.png")
def images(df, col, match):
    # Build a word cloud from summed TF-IDF weights of articles where df[col] == match
    print(col)
    tt = df[df[col] == match]
    v = TfidfVectorizer(max_df=0.1, min_df=0.05, max_features=10000, ngram_range=(1, 2))
    x = v.fit_transform(tt['body_text_new'])
    freqs = dict()
    for word, idx in v.vocabulary_.items():
        freqs[word] = x.getcol(idx).sum()
    w = WordCloud(width=800, height=600, mode='RGBA', background_color='white', max_words=2000).fit_words(freqs)
    plt.figure(figsize=(20, 10))
    plt.imshow(w)
    plt.savefig(col + ".png")
for word in f:
    try:
        images(data, word, 1)
    except ValueError:
        # some tags have too few matching articles to vectorize
        pass
# function to print out a classification model report
def report(model_name, test, pred):
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%")
    print("     Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%")
    print("        Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%")
    print("      F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%")
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y_predictions, test_size=0.2, random_state=0)
print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")
from sklearn.dummy import DummyClassifier
smp_clf = DummyClassifier(strategy="most_frequent")
# train the majority-class baseline
smp_clf.fit(X_train, y_train)
# cross validation predictions (kept separate so the fitted model isn't clobbered)
smp_pred = cross_val_predict(smp_clf, X_train, y_train, cv=3, n_jobs=4)
# print out the classification report
report("Majority Classifier (Training Set)", y_train, smp_pred)
# Complement Naive Bayes instance
from sklearn.naive_bayes import ComplementNB
cnb_clf = ComplementNB()
# train Complement Naive Bayes
cnb_clf.fit(X_train, y_train)
# cross validation predictions
cnb_pred = cross_val_predict(cnb_clf, X_train, y_train, cv=3, n_jobs=4)
# print out the classification report
report("Complement Naive Bayes (Training Set)", y_train, cnb_pred)
# SGD instance (SGDClassifier is already imported at the top)
sgd_clf = SGDClassifier(random_state=0)
# train SGD
sgd_clf.fit(X_train, y_train)
# cross validation predictions
sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)
# print out the classification report
report("Stochastic Gradient Descent (Training Set)", y_train, sgd_pred)
pickle.dump(sgd_clf, open('sgd_clf.pkl','wb'))